Initial Grouped Feature Importance

Grouped Feature Importance from Initial Tuned Models

Author

Jacob Johnson

Published

September 29, 2025

########################################
########################################
## 02.1-Initial_GPFI.qmd              ##
## Jacob Johnson                      ##
## ---------------------------------- ##
## Quarto document to summarize the   ##
## feature importance results from    ##
## the models with the best RMSE from ##
## the tuning done in 01              ##
## - Loads feature importance results ##
## - Plots feature importance         ##
########################################
########################################

Set Up

## Load packages
library(dplyr)
library(ggplot2)
library(purrr)
library(stringr)
library(gridExtra)
library(tidyr)
library(wesanderson)

Specify file path:

## File-system roots for the project data; exactly one is assigned to
## `fp` below and used to build all input/output paths in this document.

## Root when working locally
local_fp <- "//cee/projects/subseasonal_extreme/"
## Root when working on the CEE Compute Server
cee_fp <- "/projects/subseasonal_extreme/"

## Active root for this run
fp <- local_fp

Data Steps

Load climate inputs:

## Variables identified by Maike as redundant
## Input variables identified by Maike as redundant; these columns are
## dropped from the climate inputs when they are loaded below.
redundant_cols <- c(
  "slp_weekave_atlantic_ocean_mean",
  "h_850_weekave_conus_mean",
  "h_850_weekave_atlantic_ocean_pc4",
  "h_850_weekave_mexico_gulf_mean",
  "slp_weekave_mexico_gulf_mean",
  "slp_weekave_conus_pc1",
  "ts_weekave_conus_pc1",
  "slp_weekave_atlantic_trop_mean",
  "ts_weekave_southern_canada_pc1",
  "t_850_weekave_southern_canada_mean",
  "h_850_weekave_atlantic_ocean_pc5",
  "slp_weekave_atlantic_ocean_pc6",
  "h_500_weekave_pacific_trop_mean",
  "h_850_weekave_atlantic_trop_mean",
  "t_500_weekave_pacific_trop_mean",
  "t_500_weekave_atlantic_trop_pc1",
  "h_850_weekave_pacific_ocean_pc1",
  "t_850_weekave_conus_pc1",
  "slp_weekave_atlantic_ocean_pc1",
  "slp_weekave_pacific_trop_mean",
  "t_850_weekave_southern_canada_pc1",
  "h_850_weekave_arctic_pc1",
  "h_500_weekave_atlantic_trop_mean",
  "h_850_weekave_atlantic_ocean_pc2",
  "slp_weekave_atlantic_ocean_pc3",
  "h_850_weekave_atlantic_ocean_pc7",
  "t_500_weekave_mexico_gulf_mean",
  "t_500_weekave_southern_canada_pc1",
  "slp_weekave_pacific_trop_pc2",
  "h_850_weekave_arctic_mean",
  "ts_weekave_atlantic_trop_mean",
  "h_850_weekave_pacific_trop_mean"
)

## Weekly regional climate inputs (1980-2020). The `date` column is
## assumed to be a YYYYWW-style value, so splitting the string after the
## 4th character yields year and week -- TODO confirm format with the
## upstream data prep.
climate_inputs <-
  readr::read_csv(
    paste0(fp, "analysis-data/weekly_aves_regional_inputs_1980_2020.csv"),
    show_col_types = FALSE
  ) |>
  mutate(date = as.character(date)) |>
  ## Split after the 4th character; keep the original date column
  separate(date, into = c("year", "week"), sep = 4, remove = FALSE) |>
  mutate(
    year = as.numeric(year),
    week = as.numeric(week)
  ) |>
  ## Drop the variables flagged as redundant above
  select(-all_of(redundant_cols))

Prepare variable names to attach to the feature importance results (the warning below is expected and is due to separating variable names with differing numbers of parts):

## Build a lookup table mapping each input variable name to parsed
## metadata: the climate variable (with pressure level), the input
## region, and the summary statistic (a "pc*" component or "mean").
## separate() splits on non-alphanumeric characters by default, so
## names with fewer than 5 tokens produce the expected warning shown
## below the chunk.
input_vars <- 
  data.frame(var = c(names(climate_inputs)[-c(1:3)], "temp")) |> 
  mutate(var = str_remove(var, "weekave_")) |>
  ## Split each name into up to 5 underscore-delimited tokens
  separate(
    var,
    into = c("var1", "var2", "var3", "var4", "var5"),
    remove = FALSE
  ) |>
  rename(climate_var = var1) |>
  ## Pressure-level variables (e.g. h_500): fold the level into the
  ## climate variable name...
  mutate(
    climate_var = ifelse(
      var2 %in% c("500", "850"), 
      paste0(climate_var, var2), 
      climate_var
  )) |>
  ## ...and blank the consumed level token
  mutate(
    var2 = ifelse(
      var2 %in% c("500", "850"), 
      NA, 
      var2
  )) |> 
  rename(input_region = var2) |>
  ## var6 captures the 3rd token when it is a stat ("pc*" or "mean")
  mutate(
    var6 = ifelse(
      str_detect(var3, "pc"),
      var3, 
      ifelse(
        str_detect(var3, "mean"), 
        var3, 
        NA
      )
    )
  ) |>
  ## var3 keeps only region words (stat tokens are blanked)
  mutate(
    var3 = ifelse(
      str_detect(var3, "pc"),
      NA, 
      ifelse(
        str_detect(var3, "mean"), 
        NA, 
        var3
      )
    )
  ) |>
  ## Append any remaining region word in var3 onto input_region
  mutate(
    input_region = ifelse(
      is.na(var3),
      input_region, 
      ifelse(
        is.na(input_region),
        var3, 
        paste0(input_region, "_", var3) 
      )
  )) |>
  select(-var3) |>
  ## Repeat the same stat-vs-region split for the 4th token (var7
  ## captures a stat, var4 keeps a region word)
  mutate(
    var7 = ifelse(
      str_detect(var4, "pc"),
      var4, 
      ifelse(
        str_detect(var4, "mean"), 
        var4, 
        NA
      )
    )
  ) |>
  mutate(
    var4 = ifelse(
      str_detect(var4, "pc"),
      NA, 
      ifelse(
        str_detect(var4, "mean"), 
        NA, 
        var4
      )
    )
  ) |>
  mutate(
    input_region = ifelse(
      is.na(var4),
      input_region,  
      ifelse(
        is.na(input_region),
        var4, 
        paste0(input_region, "_", var4) 
      )
  )) |>
  select(-var4) |>
  ## The 5th token, when present, is the statistic; otherwise fall back
  ## to whichever captured stat token (var6 or var7) is populated
  rename(stat = var5) |>
  mutate(
    stat = ifelse(
      is.na(stat) & is.na(var6), 
      var7,
      ifelse(
        is.na(stat) & is.na(var7),
        var6, 
        stat
      )
  )) |>
  select(-var6, -var7) |>
  ## The lone "temp" variable has no region/stat tokens; label both
  ## fields "temp" so joins downstream do not hit NAs
  mutate(
    input_region = ifelse(is.na(input_region), "temp", input_region),
    stat = ifelse(is.na(stat), "temp", stat)
  ) |>
  ## Stable row id, placed first for readability
  mutate(var_id = 1:n()) |> 
  select(var_id, everything())
Warning: Expected 5 pieces. Missing pieces filled with `NA` in 531 rows [1, 3, 12, 13,
14, 15, 16, 17, 20, 24, 27, 32, 33, 34, 35, 36, 37, 38, 39, 40, ...].

FI Results

Determine files containing ESN feature importance results:

## Directory holding the grouped permutation feature importance (GPFI)
## results from the tuned models in 01
fi_fp <- paste0(fp, "agu-manuscript-code/EESNs/02-grouped-feature-importance/results/GPFI-nreps01/")
## Keep only the per-target-region result files; list.files() filters
## directly via its `pattern` argument ("targetregion" has no regex
## metacharacters, so this matches the old str_detect() filter)
fi_files <- list.files(fi_fp, pattern = "targetregion")

Load in groups from Maike:

## Negation of %in%, used below to drop lagged features
`%ni%` <- Negate(`%in%`)
## Variable groups (clusters) provided by Maike. Keep only the grouping
## metadata and drop lagged features, i.e. names whose 3rd-6th
## characters are "_lag" or "lag_"
clusters <-  readr::read_csv(paste0(
  fp, "agu-manuscript-code/EESNs/02-grouped-feature-importance/02.1-InitialFeatureImportance/All_GPFI_Merged.csv")) |>
  select(Group, Feature, Region, Horizon) |> 
  mutate(middle = substr(Feature, 3, 6)) |> 
  filter(middle %ni% c("_lag", "lag_")) |> 
  select(-middle)
Rows: 22160 Columns: 7
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Feature, Region
dbl (5): Group, Mean_Importance, Std_Importance, Group_Size, Horizon

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Join Feature Importance with Groups

## Attach the parsed variable metadata (input_vars) to each group row so
## the feature importance results can later be joined on Group/tau/region
input_vars_with_groups <- 
  clusters |>
  ## Strip "_weekave" so names match the `var` column of input_vars
  mutate(var = str_remove_all(Feature, "_weekave")) |> 
  rename(tau = Horizon,
         region = Region) |> 
  left_join(input_vars, by="var") |> 
  select(-Feature, -var_id)

Load ESN feature importance results:

## Load all ESN feature importance result files into one data frame.
## Filenames appear to encode the model settings, e.g.
## "targetregion<R>_tau<T>_m<M>_nh<NH>_nu<NU>.csv" -- TODO confirm.
fi_df <-
  set_names(fi_files) |>
  map(
    .f = function(x)
      ## Group id = row position within each result file
      read.csv(paste0(fi_fp, x)) |> mutate(Group = 1:n())
  ) |>
  list_rbind(names_to = "file") |> 
  ## Fix: the old pattern ".csv" treated "." as a regex wildcard and was
  ## unanchored; escape the dot and anchor to the end of the name
  mutate(file = str_remove(file, "\\.csv$")) |>
  ## Parse the model settings out of the file name
  separate(file, into = c("region", "tau", "m", "nh", "nu")) |>
  mutate(
    region = str_remove(region, "targetregion"),
    tau = as.numeric(str_remove(tau, "tau")),
    m = as.numeric(str_remove(m, "m")),
    nh = as.numeric(str_remove(nh, "nh")),
    nu = as.numeric(str_remove(nu, "nu")), 
    ## Flip the sign so larger values mean greater importance --
    ## NOTE(review): confirm the sign convention of the stored fi values
    fi = -fi
  ) |> 
  left_join(input_vars_with_groups, by = c("Group", "tau", "region")) |>
  filter(!is.na(fi)) |> 
  select(region:nu, Group:stat, fi)

Create ordered version of feature importance:

## Rank the groups within each (region, tau) by descending feature
## importance; ties share a rank via dense_rank(). Rows are sorted so
## the most important group comes first within each region/horizon.
fi_ordered <- 
  fi_df |> 
  arrange(region, tau, desc(fi)) |> 
  mutate(rank = dense_rank(desc(fi)), .by = c(region, tau))

Save data:

## Save the ranked feature importance results for downstream scripts
write.csv(
  fi_ordered,
  "../results/gpfi-initial.csv",
  row.names = FALSE
)

Most Important Groups by Region and Horizon

## Subtitles: one row per (region, tau) with the tuned hyperparameters
## formatted for plot subtitles. mean() collapses the per-file values
## (presumably constant within each region/tau -- confirm against the
## tuning output).
subtitles <- fi_df |>
  summarise(across(c(m, nu, nh), mean), .by = c(region, tau)) |>
  mutate(st = paste0("m:", m, " nu:", nu, " nh:", nh))

## Vars in Groups: newline-separated list of the variables in each
## group, used for plot labels. `.groups = "drop"` silences the
## summarise() grouping message and returns an ungrouped tibble (the
## old code left the result grouped by region/tau, which the downstream
## join did not rely on).
groupvars <- fi_df |> 
  group_by(region, tau, Group) |> 
  summarise(groupvars = paste(var, collapse = ",\n"), .groups = "drop")
`summarise()` has grouped output by 'region', 'tau'. You can override using the
`.groups` argument.
## Ordered df: average (plus min/max) FI per group within each
## (region, tau), ranked and joined with the plot subtitles and group
## variable lists. NOTE: this object is overwritten by a variable-level
## summary later in the document.
fi_ave_ordered <-
  fi_ordered |> 
  summarise(
    fi_ave = mean(fi), 
    fi_min = min(fi),
    fi_max = max(fi),
    .by = c(region, tau, Group)
  ) |> 
  group_by(region, tau) |>
  ## arrange() ignores grouping by default, so this sorts the whole
  ## frame; dense_rank() below is still computed per group
  arrange(tau, desc(fi_ave)) |>
  mutate(rank = dense_rank(desc(fi_ave))) |> 
  ungroup() |> 
  left_join(subtitles, by=c("region", "tau")) |> 
  left_join(groupvars, by=c("region", "tau", "Group"))

## Max FI: largest average feature importance per forecast region,
## used to set common plot axis limits.
## Bug fix: the old call was max(fi_ave, na.omit = F). `na.omit` is not
## an argument of max(), so FALSE was passed through `...` and included
## in the maximum as 0 -- wrong whenever every fi_ave is negative. NAs
## are already removed by na.omit() above, so no na.rm is needed.
fi_max <- fi_ave_ordered |>
  na.omit() |> 
  group_by(region) |> 
  summarise(plot_max = max(fi_ave))

MW

NE

SE

SW

W

Averaged (over forecast region)

Compute average FI for each forecast horizon and input variable (averaged over forecast region):

## Average (plus min/max) FI per (tau, var), pooled over forecast
## regions, then ranked within each horizon. Overwrites the group-level
## fi_ave_ordered computed earlier. row_number() breaks ties by row
## order, unlike the dense_rank() used above.
fi_ave_ordered <-
  fi_ordered |> 
  summarise(
    fi_ave = mean(fi), 
    fi_min = min(fi),
    fi_max = max(fi),
    .by = c(tau, var)
  ) |> 
  arrange(tau, desc(fi_ave)) |>
  mutate(rank = row_number(), .by = tau)

Extract the 25 variables with the highest average feature importance (for each horizon):

## Return the top `n_top` ranked rows for one forecast horizon, with
## `rank` converted to a reversed factor so plots list the most
## important variable at the top.
## @param df      Data frame with `rank` and `tau` columns.
## @param horizon Forecast horizon (tau) to keep.
## @param n_top   Number of top ranks to keep (default 25).
top_ranks_for_tau <- function(df, horizon, n_top = 25) {
  df |>
    filter(rank <= n_top, tau == horizon) |>
    mutate(rank = factor(rank, levels = n_top:1))
}

fi_ave_ordered_sub1 <- top_ranks_for_tau(fi_ave_ordered, 1)
fi_ave_ordered_sub2 <- top_ranks_for_tau(fi_ave_ordered, 2)
fi_ave_ordered_sub3 <- top_ranks_for_tau(fi_ave_ordered, 3)
fi_ave_ordered_sub4 <- top_ranks_for_tau(fi_ave_ordered, 4)

Plots of average FI plus/minus minimum and maximum FI values (averaged over forecast region):

Averaged (over forecast region, input region, and stat)

Compute average FI by forecast horizon and input climate variable:

## Average (plus min/max) FI per (tau, climate_var), pooled over
## forecast regions, input regions, and statistics, then ranked within
## each horizon.
fi_ordered_ave_by_tau_climvar <-
  fi_ordered |> 
  summarise(
    fi_ave = mean(fi), 
    fi_min = min(fi),
    fi_max = max(fi),
    .by = c(tau, climate_var)
  ) |> 
  arrange(tau, desc(fi_ave)) |>
  mutate(rank = row_number(), .by = tau)

Extract the 25 variables with the highest average feature importance (for each horizon):

## Top 25 climate variables for each horizon; `rank` becomes a reversed
## factor so plots list the most important variable at the top.
fi_ordered_ave_by_tau_climvar_sub1 <-
  fi_ordered_ave_by_tau_climvar |>
  filter(tau == 1, rank <= 25) |>
  mutate(rank = factor(rank, levels = rev(1:25)))

fi_ordered_ave_by_tau_climvar_sub2 <-
  fi_ordered_ave_by_tau_climvar |>
  filter(tau == 2, rank <= 25) |>
  mutate(rank = factor(rank, levels = rev(1:25)))

fi_ordered_ave_by_tau_climvar_sub3 <-
  fi_ordered_ave_by_tau_climvar |>
  filter(tau == 3, rank <= 25) |>
  mutate(rank = factor(rank, levels = rev(1:25)))

fi_ordered_ave_by_tau_climvar_sub4 <-
  fi_ordered_ave_by_tau_climvar |>
  filter(tau == 4, rank <= 25) |>
  mutate(rank = factor(rank, levels = rev(1:25)))

Plots of average FI plus/minus minimum and maximum FI values (averaged over forecast region):

FI by Variable

Input region:

Statistic:

Forecast region:

Climate variable:

Top 100

Top 50

Top 25

Top 100 (marginal)

Top 50 (marginal)

Top 25 (marginal)